package org.zty.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
/**
 * Utilities for extracting text from HTML code.
 * 
 * @author YangJunping
 * @date 2010-7-15
 */
public class HtmlUtil {
	public static void main(String[] args) {
		StringBuffer htmlStr = new StringBuffer();
		htmlStr.append("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>")
		       .append("<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en'><head><title>aaa</title><mce:script type='text/javascript'></mce:script>")
		       .append("<link href=static_files/help.css mce_href=static_files/help.css rel='stylesheet' type='text/css' media='all' />")
		       .append("</head><body><ul><li>汉字</li></ul><a><!-- 这是 --> <table><tr><td>fdsfd</td></tr><tr><td>fdsfd</td></tr></table>  <br/>fdsf");
		
		htmlStr.append("<p>aaaaaa</p><table cellspacing=\"0\" cellpadding=\"0\" width=\"680\" align=\"center\" border=\"1\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">    <tbody>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">采购公告标题：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙县林业局公开招标采购复合肥公告</td>        </tr>        <tr>            <td width=\"23%\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;采购项目名称：</td>            <td width=\"34%\" colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙县林业局公开招标采购复合肥公告</td>        </tr>        <tr>            <td width=\"23%\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;预审公告：</td>            <td width=\"34%\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">无</td>            <td width=\"18%\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">采购方式：</td>            <td width=\"25%\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">公开招标</td>        </tr>        <tr>         
   <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;招标编号：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙采公[2009]04号</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">公告日期：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">2009年12月10日15时44分</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;行政区划：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">宜宾市－珙　县</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">采购包个数：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">1个</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;采 购 人：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙县林业局</td>            <td style=\"border-right: black 1px solid; border-top: black 1px 
solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">更正公告：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">无</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;委托招标单位/<br />            &nbsp;采购中介机构名称：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">中介机构编码：</td>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>        </tr>        <tr>            <td colspan=\"4\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">            <table cellspacing=\"0\" cellpadding=\"0\" width=\"100%\" align=\"center\" border=\"1\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">                <tbody>                    <tr>                        <td colspan=\"2\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">包号：1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;类别：货物&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;采购单位：珙县林业局</td>                    </tr>                    <tr>                        <td 
width=\"20%\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">包的描述：</td>                        <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">复合肥</td>                    </tr>                    <tr>                        <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">该包技术指标：</td>                        <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>                    </tr>                </tbody>            </table>            <br />            <br />            　　</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;供应商资格要求：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;投标人应具备的资格条件：<br />            1、在中国境内注册并具有独立法人资格且经营范围具有经营农资肥料的合法企业；<br />            2、具有良好的商业信誉和健全的财务会计制度；<br />            3、具有履行合同所必须的设备和专业技术能力；<br />            4、具有依法缴纳税收和社会保障资金的良好记录；<br />            5、参加本次政府采购活动前三年内，在经营活动中没有重大违法违规记录.</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;标书发售方式：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: 
collapse\">现场出售</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;标书发售起止时间：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">2009年12月11日至2009年12月18日上午10时。</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;标书售价：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">人民币400元/份（招标文件售后不退, 投标资格不能转让）。</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;标书发售地点：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙县人民政府采购中心（珙县政务中心四楼）</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;投标区Email：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;投标截止日期：</td>            <td colspan=\"3\" 
style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">2009年12月30日15时0分</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;投标地点：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙县政务中心四楼开标室</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;开标日期：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">2009年12月30日15时0分</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;开标地点：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">珙县政务中心四楼开标室</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;公开答疑会时间：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; 
border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;公开答疑会地点：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;联系人/联系方式：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">联 系 人：周永西 罗家聪 <br />            联系电话：0831--4012683</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;其它内容：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;</td>        </tr>        <tr>            <td style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">&nbsp;备&nbsp;&nbsp;&nbsp;&nbsp;注：</td>            <td colspan=\"3\" style=\"border-right: black 1px solid; border-top: black 1px solid; border-left: black 1px solid; border-bottom: black 1px solid; border-collapse: collapse\">购买标书时请带介绍信及相关资质原件。</td>        </tr>    </tbody></table>");
		htmlStr.append("<?xml:namespace prefix = st1 ns = \"urn:schemas-microsoft-com:office:smarttags\" //><st1:chsdate w:st=\"on\" IsROCDate=\"False\" IsLunarDate=\"False\" Day=\"18\" Month=\"12\" Year=\"2003\"/>dsfafds<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /><o:p/></o:p/></SPAN/></SPAN/></P/>");
		htmlStr.append("</body></html>");
		System.out.println(Html2Text(htmlStr.toString()));
	}
	
	/**
     * Extracts the plain-text content of an HTML string.
     * <p>
     * The markup is visited with an htmlparser {@link StringBean} configured with
     * links off, non-breaking-space replacement on and collapse on; the collected
     * text is then passed through {@link #clearWordFormat(String)}.
     *
     * @param inputHtml the HTML markup to convert; may be {@code null} or blank
     * @return the extracted text, or an empty string when the input is blank or
     *         the parser throws a {@link ParserException} (the stack trace is
     *         printed in that case)
     */
    public static String Html2Text(String inputHtml){
        // Nothing to parse: blank input yields an empty result.
        if (!StringUtils.isNotBlank(inputHtml)) {
            return "";
        }
        Parser htmlParser = new Parser();
        try {
            htmlParser.setInputHTML(inputHtml);

            StringBean textCollector = new StringBean();
            textCollector.setLinks(false);
            textCollector.setReplaceNonBreakingSpaces(true);
            textCollector.setCollapse(true);
            htmlParser.visitAllNodesWith(textCollector);

            String extracted;
            if (textCollector.getStrings() == null) {
                extracted = "";
            } else {
                extracted = textCollector.getStrings();
            }
            return clearWordFormat(extracted);
        } catch (ParserException e) {
            e.printStackTrace();
            return "";
        }
    }
    
    /**
     * Matches tags that start with {@code <?xml} (optionally with a backslash
     * before the question mark) up to the next {@code >}.
     */
    private static final Pattern XML_INSTRUCTION_TAG = Pattern.compile("<\\\\?\\?xml[^>]*>");

    /**
     * Matches opening or closing tags whose name carries a namespace prefix,
     * e.g. {@code <o:p>} or {@code </st1:chsdate>}.
     */
    private static final Pattern NAMESPACED_TAG = Pattern.compile("<\\/?\\w+:[^>]*>");

    /**
     * Matches the literal three-character sequence slash, space, slash.
     * NOTE(review): this looks like a JavaScript regex literal (possibly
     * {@code /&nbsp;/}) that was carried over verbatim; the pattern is kept
     * unchanged here. Confirm the intended pattern before altering it.
     */
    private static final Pattern SLASH_SPACE_SLASH = Pattern.compile("/ /");

    /**
     * Removes formatting residue left by content pasted from Word.
     * <p>
     * Three replacements are applied in order: {@code <?xml ...>} tags are removed,
     * namespace-prefixed tags such as {@code <o:p>} are removed, and every
     * occurrence of the sequence slash-space-slash is replaced by a single space.
     * The patterns are compiled once and shared between calls.
     *
     * @param content the text to clean; may be {@code null}
     * @return the cleaned text, or an empty string when {@code content} is {@code null}
     */
    public static String clearWordFormat(String content){
        // Guard against null instead of failing inside Pattern.matcher().
        if (content == null) {
            return "";
        }
        String cleaned = XML_INSTRUCTION_TAG.matcher(content).replaceAll("");
        cleaned = NAMESPACED_TAG.matcher(cleaned).replaceAll("");
        return SLASH_SPACE_SLASH.matcher(cleaned).replaceAll(" ");
    }
    
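	/* Earlier regex-based implementation of Html2Text, left commented out; the active
	   method of the same name in this class uses StringBean instead.
	public static String Html2Text(String inputString) {    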
        String htmlStr = inputString; // 含html标签的字符串    
        String textStr = "";    
        java.util.regex.Pattern p_script;    
        java.util.regex.Matcher m_script;    
        java.util.regex.Pattern p_style;    
        java.util.regex.Matcher m_style;    
        java.util.regex.Pattern p_html;    
        java.util.regex.Matcher m_html;    
  
        java.util.regex.Pattern p_html1;    
        java.util.regex.Matcher m_html1;    
  
       try {    
            String regEx_script = "<[//s]*?script[^>]*?>[//s//S]*?<[//s]*?///[//s]*?script[//s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[//s//S]*?<///script>    
            String regEx_style = "<[//s]*?style[^>]*?>[//s//S]*?<[//s]*?///[//s]*?style[//s]*?>"; // 定义style的正则表达式{或<style[^>]*?>[//s//S]*?<///style>    
            String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式    
            String regEx_html1 = "<[^>]+";    
            p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);    
            m_script = p_script.matcher(htmlStr);    
            htmlStr = m_script.replaceAll(""); // 过滤script标签    
  
            p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);    
            m_style = p_style.matcher(htmlStr);    
            htmlStr = m_style.replaceAll(""); // 过滤style标签    
  
            p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);    
            m_html = p_html.matcher(htmlStr);    
            htmlStr = m_html.replaceAll(""); // 过滤html标签    
  
            p_html1 = Pattern.compile(regEx_html1, Pattern.CASE_INSENSITIVE);    
            m_html1 = p_html1.matcher(htmlStr);    
            htmlStr = m_html1.replaceAll(""); // 过滤html标签    
  
            htmlStr = htmlStr.replace("&nbsp;", "");
            
            textStr = htmlStr;    
  
        } catch (Exception e) {    
            System.err.println("Html2Text: " + e.getMessage());    
        }    
  
       return textStr;// 返回文本字符串    
   }   */ 
}

